Day3 - Here We R! - Regression using R

Kamran Khan & Didjier Masangwi

2024-11-21

Packages

Code
# List of required packages
required_packages <- c("tidyverse", "survival", "survminer", 
                       "ggplot2", "labelled", "gtsummary", "labelled", 
                       "purrr", "here", "EnvStats", "rstatix")

# Check and install missing packages
install_if_missing <- function(packages) {
  missing <- packages[!packages %in% installed.packages()[, "Package"]]
  if (length(missing) > 0) {
    install.packages(missing, dependencies = TRUE) # Include dependencies
    message("Installed missing packages: ", paste(missing, collapse = ", "))
  } else {
    message("All required packages are already installed.")
  }
}

# Run the function
install_if_missing(required_packages)

Loading the required libraries

Code
library(tidyverse)
library(survival)
library(survminer)
library(ggplot2)
library(gtsummary)
library(labelled)
library(purrr)
library(here)
library(EnvStats)
library(rstatix)

Create a clinical dataset

Code
set.seed(123)  # For reproducibility
clinical_trial_data <- tibble(
  patient_id = 1:150,
  age = rnorm(150, mean = 50, sd = 12),  # Patient age
  sex = sample(c("Male", "Female"), 150, replace = TRUE),  # Patient sex
  treatment_group = sample(c("Control", "Treatment A", "Treatment B"), 150, replace = TRUE),  # Treatment groups
  time_to_event = rexp(150, rate = 0.1),  # Time to event (survival data)
  event_occurred = sample(c(0, 1), 150, replace = TRUE),  # Event status (0 = censored, 1 = event occurred)
  baseline_score = rnorm(150, mean = 75, sd = 10),  # Baseline health score
  followup_score = rnorm(150, mean = 80, sd = 10)  # Follow-up health score
)

#Inspect the data 

glimpse(clinical_trial_data)
Rows: 150
Columns: 8
$ patient_id      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
$ age             <dbl> 43.27429, 47.23787, 68.70450, 50.84610, 51.55145, 70.5…
$ sex             <chr> "Female", "Female", "Male", "Female", "Male", "Male", …
$ treatment_group <chr> "Treatment A", "Control", "Treatment A", "Control", "T…
$ time_to_event   <dbl> 4.2537311, 14.3825042, 16.4363696, 2.9411470, 17.74436…
$ event_occurred  <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, …
$ baseline_score  <dbl> 80.50044, 87.36676, 76.39098, 79.10275, 69.41543, 81.0…
$ followup_score  <dbl> 60.38292, 68.80100, 66.72245, 71.46376, 73.06695, 83.8…

Data Manipulation and Visualization with tidyverse

Code
clinical_trial_data <- clinical_trial_data |> 
  mutate(Group = factor(treatment_group))

glimpse(clinical_trial_data)
Rows: 150
Columns: 9
$ patient_id      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
$ age             <dbl> 43.27429, 47.23787, 68.70450, 50.84610, 51.55145, 70.5…
$ sex             <chr> "Female", "Female", "Male", "Female", "Male", "Male", …
$ treatment_group <chr> "Treatment A", "Control", "Treatment A", "Control", "T…
$ time_to_event   <dbl> 4.2537311, 14.3825042, 16.4363696, 2.9411470, 17.74436…
$ event_occurred  <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, …
$ baseline_score  <dbl> 80.50044, 87.36676, 76.39098, 79.10275, 69.41543, 81.0…
$ followup_score  <dbl> 60.38292, 68.80100, 66.72245, 71.46376, 73.06695, 83.8…
$ Group           <fct> Treatment A, Control, Treatment A, Control, Treatment …

Select specific columns

Code
# Select specific columns
selected_data <- clinical_trial_data %>% 
  select(patient_id, age, treatment_group, baseline_score, followup_score)


head(selected_data)
# A tibble: 6 × 5
  patient_id   age treatment_group baseline_score followup_score
       <int> <dbl> <chr>                    <dbl>          <dbl>
1          1  43.3 Treatment A               80.5           60.4
2          2  47.2 Control                   87.4           68.8
3          3  68.7 Treatment A               76.4           66.7
4          4  50.8 Control                   79.1           71.5
5          5  51.6 Treatment A               69.4           73.1
6          6  70.6 Control                   81.1           83.8

Filtering

Code
# Filter patients in the Treatment A group
filtered_data <- clinical_trial_data %>% 
  filter(treatment_group == "Treatment A")

head(filtered_data)
# A tibble: 6 × 9
  patient_id   age sex    treatment_group time_to_event event_occurred
       <int> <dbl> <chr>  <chr>                   <dbl>          <dbl>
1          1  43.3 Female Treatment A              4.25              1
2          3  68.7 Male   Treatment A             16.4               0
3          5  51.6 Male   Treatment A             17.7               1
4          8  34.8 Male   Treatment A             24.6               0
5         11  64.7 Female Treatment A              1.75              1
6         13  54.8 Female Treatment A             20.0               0
# ℹ 3 more variables: baseline_score <dbl>, followup_score <dbl>, Group <fct>

Mutate

Code
# Add a new variable for change in health score
clinical_trial_data <- clinical_trial_data %>%
  mutate(score_change = followup_score - baseline_score)

ncol(clinical_trial_data)
[1] 10
Code
names(clinical_trial_data)
 [1] "patient_id"      "age"             "sex"             "treatment_group"
 [5] "time_to_event"   "event_occurred"  "baseline_score"  "followup_score" 
 [9] "Group"           "score_change"   

Plotting with ggplot2

Code
# Boxplot of follow-up scores by treatment group
ggplot(clinical_trial_data, aes(x = age, y = Group)) +
  geom_point() +
  labs(title = "Follow-up Scores by Treatment Group", x = "Treatment Group", y = "Follow-up Score")

Statistics using R

T-test

Two sample t-test (Student’s t-test) can be used if we have two independent (unrelated) groups (e.g., males-females, treatment-non treatment) and one quantitative variable of interest.

Code
# Filter for Control and Treatment A groups
t_test_data <- clinical_trial_data |> 
  filter(treatment_group %in% c("Control", "Treatment A"))

head(t_test_data)
# A tibble: 6 × 10
  patient_id   age sex    treatment_group time_to_event event_occurred
       <int> <dbl> <chr>  <chr>                   <dbl>          <dbl>
1          1  43.3 Female Treatment A              4.25              1
2          2  47.2 Female Control                 14.4               1
3          3  68.7 Male   Treatment A             16.4               0
4          4  50.8 Female Control                  2.94              0
5          5  51.6 Male   Treatment A             17.7               1
6          6  70.6 Male   Control                 37.5               1
# ℹ 4 more variables: baseline_score <dbl>, followup_score <dbl>, Group <fct>,
#   score_change <dbl>

Perform t-test

Code
# Perform t-test
t.test(followup_score ~ treatment_group, data = t_test_data)

    Welch Two Sample t-test

data:  followup_score by treatment_group
t = -0.84913, df = 74.638, p-value = 0.3985
alternative hypothesis: true difference in means between group Control and group Treatment A is not equal to 0
95 percent confidence interval:
 -5.838296  2.348822
sample estimates:
    mean in group Control mean in group Treatment A 
                 77.43787                  79.18261 

Presenting the results using gtsummary package

Code
t_test_reults <- t_test_data |>  
  select(treatment_group,followup_score) |> 
  tbl_summary(
    by = treatment_group, 
    statistic = followup_score ~ "{mean} ({sd})", 
    label = list(followup_score ~ "Score"),
    missing = c("no")) |>  
    add_p(test = all_continuous() ~ "t.test") |> 
    bold_labels() 
 
t_test_reults
Characteristic Control
N = 39
1
Treatment A
N = 55
1
p-value2
Score 77 (10) 79 (9) 0.4
1 Mean (SD)
2 Welch Two Sample t-test

We can now save the results as .docx format

Code
# save
 t_test_reults |>
   as_flex_table() |>
   flextable::save_as_docx(
     path = here::here("t_test_reults_tbl.docx")
   )

Chi-sqaure test of independence

If we want to see whether there’s an association between two categorical variables we can use the Pearson’s chi-square test, often called chi-square test of independence.

Code
# Create contingency table for sex and treatment group
contingency_table <- table(clinical_trial_data$sex, clinical_trial_data$treatment_group)

# Perform chi-square test
chisq.test(contingency_table)

    Pearson's Chi-squared test

data:  contingency_table
X-squared = 0.72972, df = 2, p-value = 0.6943

Using gtsummary()

Code
clinical_trial_data |>
  select(treatment_group, sex) |> 
  tbl_summary(by = treatment_group) |>
  add_p() |> 
  add_n()
Characteristic N Control
N = 39
1
Treatment A
N = 55
1
Treatment B
N = 56
1
p-value2
sex 150


0.7
    Female
15 (38%) 26 (47%) 24 (43%)
    Male
24 (62%) 29 (53%) 32 (57%)
1 n (%)
2 Pearson’s Chi-squared test

ANOVA

The data in dataDWL dataset. In this example we explore the variations between weight loss according to four different types of diet. The question that may be asked is: does the average weight loss differ according to the diet?

Code
library(readxl)
dataDWL <- read_excel("dataDWL.xlsx")

dataDWL <- dataDWL %>% 
  mutate(Diet = factor(Diet))
glimpse(dataDWL)
Rows: 60
Columns: 2
$ WeightLoss <dbl> 9.9, 9.6, 8.0, 4.9, 10.2, 9.0, 9.8, 10.8, 6.2, 8.3, 12.9, 1…
$ Diet       <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, B, B, B, B, B,…
Code
head(dataDWL)
# A tibble: 6 × 2
  WeightLoss Diet 
       <dbl> <fct>
1        9.9 A    
2        9.6 A    
3        8   A    
4        4.9 A    
5       10.2 A    
6        9   A    

Summary statistics

The WeightLoss summary statistics for each diet group are:

Code
DWL_summary <- dataDWL %>%
  group_by(Diet) %>%
  dplyr::summarise(
    n = n(),
    na = sum(is.na(WeightLoss)),
    min = min(WeightLoss, na.rm = TRUE),
    q1 = quantile(WeightLoss, 0.25, na.rm = TRUE),
    median = quantile(WeightLoss, 0.5, na.rm = TRUE),
    q3 = quantile(WeightLoss, 0.75, na.rm = TRUE),
    max = max(WeightLoss, na.rm = TRUE),
    mean = mean(WeightLoss, na.rm = TRUE),
    sd = sd(WeightLoss, na.rm = TRUE),
    skewness = EnvStats::skewness(WeightLoss, na.rm = TRUE),
    kurtosis= EnvStats::kurtosis(WeightLoss, na.rm = TRUE)
  ) %>%
  ungroup()

DWL_summary
# A tibble: 4 × 12
  Diet      n    na   min    q1 median    q3   max  mean    sd skewness kurtosis
  <fct> <int> <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 A        15     0   4.9  8.15    9.6  10.5  12.9  9.18  2.30  -0.471    -0.302
2 B        15     0   3.8  7.85    9.2  10.8  12.7  8.91  2.78  -0.467    -0.515
3 C        15     0   8.7 10.8    12.2  13    15.1 12.1   1.79  -0.0451   -0.530
4 D        15     0   5.8  9.5    10.5  11.8  13.7 10.5   2.23  -0.475     0.229

Run the ANOVA test

We will perform an ANOVA to test the null hypothesis that the mean weight loss is the same for all the diet groups.

Code
# Compute the analysis of variance
anova_one_way <- aov(WeightLoss ~ Diet, data = dataDWL)

# Summary of the analysis
summary(anova_one_way)
            Df Sum Sq Mean Sq F value  Pr(>F)   
Diet         3  97.33   32.44   6.118 0.00113 **
Residuals   56 296.99    5.30                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Present the results using gtsummary

Code
gt_summ_ANOVA <- dataDWL %>% 
  tbl_summary(
    by = Diet, 
    statistic = WeightLoss ~ "{mean} ({sd})", 
    digits = list(everything() ~ 1), #number of decimal places (gtsummary)
    label = list(WeightLoss ~ "Weight Loss (kg)"), 
    missing = c("no")) |> 
  add_p(test = WeightLoss ~ "aov") 

gt_summ_ANOVA
Characteristic A
N = 15
1
B
N = 15
1
C
N = 15
1
D
N = 15
1
p-value2
Weight Loss (kg) 9.2 (2.3) 8.9 (2.8) 12.1 (1.8) 10.5 (2.2) 0.001
1 Mean (SD)
2 One-way analysis of means

Post-hoc tests

A significant one-way ANOVA is generally followed up by post-hoc tests to perform multiple pairwise comparisons between groups

The output contains the following columns of interest:

  • estimate: estimate of the difference between means of the two groups

  • conf.low, conf.high: the lower and the upper end point of the confidence interval at 95% (default)

  • p.adj: p-value after adjustment for the multiple comparisons.

Pairwise Comparison

Code
# Pairwise comparisons
pwc_Tukey <- dataDWL %>% 
  tukey_hsd(WeightLoss ~ Diet)

pwc_Tukey 
# A tibble: 6 × 9
  term  group1 group2 null.value estimate conf.low conf.high   p.adj
* <chr> <chr>  <chr>       <dbl>    <dbl>    <dbl>     <dbl>   <dbl>
1 Diet  A      B               0   -0.273   -2.50      1.95  0.988  
2 Diet  A      C               0    2.93     0.707     5.16  0.00513
3 Diet  A      D               0    1.36    -0.867     3.59  0.377  
4 Diet  B      C               0    3.21     0.980     5.43  0.0019 
5 Diet  B      D               0    1.63    -0.593     3.86  0.222  
6 Diet  C      D               0   -1.57    -3.80      0.653 0.252  
# ℹ 1 more variable: p.adj.signif <chr>

Explanation

Pairwise comparisons were carried out using the method of Tukey and the adjusted p-values were calculated.The results in Tukey post hoc table show that the weight loss from diet C seems to be significantly larger than diet A (mean difference = 2.91 kg, 95%CI [0.71, 5.16], p=0.005 <0.05) and diet B (mean difference = 3.21 kg, 95%CI [0.98, 5.43], p=0.002 <0.05).

Regression Analysis in R

Linear Regression

Code
# Fit a linear regression model for follow-up score
linear_model <- lm(followup_score ~ baseline_score + age + treatment_group, data = clinical_trial_data)
summary(linear_model)

Call:
lm(formula = followup_score ~ baseline_score + age + treatment_group, 
    data = clinical_trial_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-28.3237  -6.0981   0.8911   6.7859  24.4590 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)                75.65192    7.73245   9.784   <2e-16 ***
baseline_score              0.01460    0.08400   0.174   0.8623    
age                         0.01309    0.07514   0.174   0.8620    
treatment_groupTreatment A  1.82602    2.18249   0.837   0.4042    
treatment_groupTreatment B  5.53946    2.12548   2.606   0.0101 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.15 on 145 degrees of freedom
Multiple R-squared:  0.04954,   Adjusted R-squared:  0.02332 
F-statistic:  1.89 on 4 and 145 DF,  p-value: 0.1154

Using the broom::tidy()

function to get the results in a nice tibble

Code
linear_model |> broom::tidy()
# A tibble: 5 × 5
  term                       estimate std.error statistic  p.value
  <chr>                         <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)                 75.7       7.73       9.78  1.14e-17
2 baseline_score               0.0146    0.0840     0.174 8.62e- 1
3 age                          0.0131    0.0751     0.174 8.62e- 1
4 treatment_groupTreatment A   1.83      2.18       0.837 4.04e- 1
5 treatment_groupTreatment B   5.54      2.13       2.61  1.01e- 2

Using gtsummary() function tbl_regression()

Code
# tidy(linear_model)
tbl_regression(linear_model) |> 
  bold_labels() |> 
  bold_p(t = 0.05)
Characteristic Beta 95% CI1 p-value
baseline_score 0.01 -0.15, 0.18 0.9
age 0.01 -0.14, 0.16 0.9
treatment_group


    Control
    Treatment A 1.8 -2.5, 6.1 0.4
    Treatment B 5.5 1.3, 9.7 0.010
1 CI = Confidence Interval

Logistic Regression

Code
# Fit logistic regression to predict event occurrence
logistic_model <- glm(event_occurred ~ age + sex + treatment_group, data = clinical_trial_data, family = binomial)
summary(logistic_model)

Call:
glm(formula = event_occurred ~ age + sex + treatment_group, family = binomial, 
    data = clinical_trial_data)

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)
(Intercept)                 0.02512    0.89167   0.028    0.978
age                         0.01190    0.01504   0.791    0.429
sexMale                    -0.46608    0.33595  -1.387    0.165
treatment_groupTreatment A -0.44221    0.43921  -1.007    0.314
treatment_groupTreatment B -0.43374    0.42684  -1.016    0.310

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 207.92  on 149  degrees of freedom
Residual deviance: 203.78  on 145  degrees of freedom
AIC: 213.78

Number of Fisher Scoring iterations: 4

Using gtsummary, function tbl_regression()

Code
gt_summ_logReg <- tbl_regression(logistic_model, exponentiate = TRUE)
gt_summ_logReg
Characteristic OR1 95% CI1 p-value
age 1.01 0.98, 1.04 0.4
sex


    Female
    Male 0.63 0.32, 1.21 0.2
treatment_group


    Control
    Treatment A 0.64 0.27, 1.51 0.3
    Treatment B 0.65 0.28, 1.49 0.3
1 OR = Odds Ratio, CI = Confidence Interval

Save as word output for sharing

Code
# save
 gt_summ_logReg |>
   as_flex_table() |>
   flextable::save_as_docx(
     path = here::here("ligisticReg_tbl.docx")
   )

Resources:

R for Data Science Book

https://argoshare.is.ed.ac.uk/healthyr_book/

gtsummary website

https://www.danieldsjoberg.com/gtsummary/

R medicine conference Gtsummary webinar

https://www.youtube.com/watch?v=p-diyV2E77o